Basic Galaxies Demo

Distribution Functions

k1 <- 100

b_norm <- G_normls(s = 4, S = 2, a = 21, A = 21, w = 1 / 2, W = 50)
s_dp <- Sq_dirichlet(c = 1, C = 2)
s_py <- Sq_pitmanyor(-1, m = 8L)
s_gn <- Sq_gnedin0(0.5)

res_mdp <- gibbsmix(gal, k1, b_norm, s_dp)
res_fdp <- seqre(res_mdp)
res_mpy <- gibbsmix(gal, k1, b_norm, s_py)
res_fpy <- seqre(res_mpy)
res_mgn <- gibbsmix(gal, k1, b_norm, s_gn)
res_fgn <- seqre(res_mgn)

plot(res_mdp, func = 'density') +
  ggtitle('Marginal DP: Galaxies density function')

ggsave('figs/mdp_pdf.png', width = 6, height = 4)
plot(res_mdp, func = 'distribution') +
  ggtitle('Marginal DP: Galaxies distribution function')

ggsave('figs/mdp_cdf.png', width = 6, height = 4)
plot(res_mdp, func = 'distribution', confint = 0.95) +
  ggtitle('Marginal DP: Galaxies distribution function')

ggsave('figs/mdp_cdf95.png', width = 6, height = 4)

plot(res_fdp, func = 'density') +
  ggtitle('Full DP: Galaxies density function')

ggsave('figs/pol_pdf.png', width = 6, height = 4)
plot(res_fdp, func = 'distribution') +
  ggtitle('Full DP: Galaxies distribution function')

ggsave('figs/pol_cdf.png', width = 6, height = 4)
plot(res_fdp, func = 'distribution', confint = 0.95) +
  ggtitle('Full DP: Galaxies distribution function')

ggsave('figs/pol_cdf95.png', width = 6, height = 4)

plot(res_mgn, func = 'density') +
  ggtitle('Marginal Gnedin: Galaxies density function')

ggsave('figs/mgn_pdf.png', width = 6, height = 4)
plot(res_mgn, func = 'distribution') +
  ggtitle('Marginal Gnedin: Galaxies distribution function')

ggsave('figs/mgn_cdf.png', width = 6, height = 4)
plot(res_mgn, func = 'distribution', confint = 0.95) +
  ggtitle('Marginal Gnedin: Galaxies distribution function')

ggsave('figs/mgn_cdf95.png', width = 6, height = 4)

plot(res_fgn, func = 'density') +
  ggtitle('Full Gnedin: Galaxies density function')

ggsave('figs/fgn_pdf.png', width = 6, height = 4)
plot(res_fgn, func = 'distribution') +
  ggtitle('Full Gnedin: Galaxies distribution function')

ggsave('figs/fgn_cdf.png', width = 6, height = 4)
plot(res_fgn, func = 'distribution', confint = 0.95) +
  ggtitle('Full Gnedin: Galaxies distribution function')

ggsave('figs/fgn_cdf95.png', width = 6, height = 4)

plot(res_mpy, func = 'density') +
  ggtitle('Marginal PY: Galaxies density function')

ggsave('figs/mpy_pdf.png', width = 6, height = 4)
plot(res_mpy, func = 'distribution') +
  ggtitle('Marginal PY: Galaxies distribution function')

ggsave('figs/mpy_cdf.png', width = 6, height = 4)
plot(res_mpy, func = 'distribution', confint = 0.95) +
  ggtitle('Marginal PY: Galaxies distribution function')

ggsave('figs/mpy_cdf95.png', width = 6, height = 4)

plot(res_fpy, func = 'density') +
  ggtitle('Full PY: Galaxies density function')

ggsave('figs/fpy_pdf.png', width = 6, height = 4)
plot(res_fpy, func = 'distribution') +
  ggtitle('Full PY: Galaxies distribution function')

ggsave('figs/fpy_cdf.png', width = 6, height = 4)
plot(res_fpy, func = 'distribution', confint = 0.95) +
  ggtitle('Full PY: Galaxies distribution function')

ggsave('figs/fpy_cdf95.png', width = 6, height = 4)

Cluster Count Distributions

k2 <- 1000

res_mdp <- gibbsmix(gal, k2, b_norm, s_dp)
res_fdp <- seqre(res_mdp)
res_mpy <- gibbsmix(gal, k2, b_norm, s_py)
res_fpy <- seqre(res_mpy)
res_mgn <- gibbsmix(gal, k2, b_norm, s_gn)
res_fgn <- seqre(res_mgn)

nm_mdp <- sapply(modes(res_mdp, mean = FALSE), length)
nm_fdp <- sapply(modes(res_fdp, mean = FALSE), length)
nm_mgn <- sapply(modes(res_mgn, mean = FALSE), length)
nm_fgn <- sapply(modes(res_fgn, mean = FALSE), length)
nm_mpy <- sapply(modes(res_mpy, mean = FALSE), length)
nm_fpy <- sapply(modes(res_fpy, mean = FALSE), length)

nk_mdp <- sapply(res_mdp$phi, nrow)
nk_fdp <- sapply(res_fdp$phi, nrow)
nk_mgn <- sapply(res_mgn$phi, nrow)
nk_fgn <- sapply(res_fgn$phi, nrow)
nk_mpy <- sapply(res_mpy$phi, nrow)
nk_fpy <- sapply(res_fpy$phi, nrow)

name_fct <- factor(rep(rep(c('Marginal DP', 'Full DP',
                             'Marginal Gnedin', 'Full Gnedin',
                             'Marginal PY', 'Full PY'), each = k2), 2),
                   levels = c('Marginal DP', 'Marginal PY', 'Marginal Gnedin',
                              'Full DP', 'Full PY', 'Full Gnedin'))
df_clust <- data.frame(Name = name_fct,
                       Type = rep(c('Modes', 'Components'), each = 6 * k2),
                       Count = c(nm_mdp, nm_fdp, nm_mgn, nm_fgn, nm_mpy, nm_fpy,
                                 nk_mdp, nk_fdp, nk_mgn, nk_fgn, nk_mpy, nk_fpy)
                       )

ggplot(df_clust, aes(x = Count)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  facet_grid(rows = vars(Name), cols = vars(Type), scales = 'free') +
  ylab(element_blank()) +
  theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

ggsave('figs/clust_counts.png', width = 6, height = 8)

Validation with Prior Draws

Functions to Draw from the MDP

rDPnorm <- function(n, alpha = 1, mu = 21, tau = 25, s = 4, S = 2,
                 c = 2, C = 4, a = 21, A = 21, w = 1, W = 100,
                 fix_a = FALSE, fix_m = FALSE, fix_t = FALSE,
                 eps = 0.05) {
  if (fix_a) {
    alpha <- rep(alpha, n)
  } else {
    alpha <- rgamma(n, c / 2, scale = C / 2)
  }
  if (fix_m) {
    mu <- rep(mu, n)
  } else {
    mu <- rnorm(n, a, sqrt(A))
  }
  if (fix_t) {
    tau <- rep(tau, n)
  } else {
    tau <- rgamma(n, w / 2, scale = W / 2)
  }
  rG0 <- sapply(1:n, function(i)
    function(nn) {
      return(matrix(c(rnorm(nn, mu[i], tau[i]),
                      rgamma(nn, s / 2, scale = S / 2)), ncol = 2))
      })
  M <- sapply(alpha,
              function(alph) 1.0 + qpois(0.95, -(alph) * log(eps)))
  phi <- function(m, alph, rG) {
    v <- rbeta(m, 1, alph)
    w <- v
    if (m > 1) {
      w[2:m] <- v[2:m] * cumprod(1 - v[1:(m - 1)])
    }
    mat <- cbind(w, rG(m))
    mat <- rbind(mat, c(1 - sum(mat[, 1]), rG(1)))
    colnames(mat) <- c('w', 'mean', 'var')
    return(mat)
  }
  return(sapply(1:n, function(nn) phi(M[nn], alpha[nn], rG0[[nn]])))
}

evalDPnorm <- function(obj, grd, func = 'density',
                       nthreads = parallel::detectCores()) {
  mdp <- list()
  mdp$phi <- obj
  class(mdp) <- 'mdpolya_result'
  return(grideval(mdp, grd, func, nthreads))
}

rmixnorm <- function(n, w, mean, var) {
  k <- sample(1:length(w), n, replace = TRUE, prob = w)
  return(list(y = rnorm(n, mean = mean[k], sd = sqrt(var[k])), k = k))
}

Repeated Simulation

m <- 5
n <- 250
k <- 100
mdps <- rDPnorm(m)
synth <- lapply(mdps, function(x) rmixnorm(n, x[, 1], x[, 2], x[, 3]))
mdps_trunc <- lapply(1:m, function(mm) {
  out <- mdps[[mm]][sort(unique(synth[[mm]]$k)), ]
  if (!is.matrix(out)) {
    out <- t(as.matrix(out))
  }
  out[, 1] <- table(synth[[mm]]$k) / n
  return(out)
})
res_mdp <- lapply(synth, function(s) {
  gibbsmix(s$y, k, b_norm, s_dp)
})
res_mdp_g <- lapply(res_mdp, grideval, func = 'distribution')
sapply(1:m, function(mm) dir.create(paste0('figs/', mm)))
## Warning in dir.create(paste0("figs/", mm)): 'figs/1' already exists
## Warning in dir.create(paste0("figs/", mm)): 'figs/2' already exists
## Warning in dir.create(paste0("figs/", mm)): 'figs/3' already exists
## Warning in dir.create(paste0("figs/", mm)): 'figs/4' already exists
## Warning in dir.create(paste0("figs/", mm)): 'figs/5' already exists
## [1] FALSE FALSE FALSE FALSE FALSE
for (mm in 1:m) {
  obj <- res_mdp_g[[mm]]
  grd <- obj$grid
  df <- data.frame(Value = rep(grd, 2),
                   K = rep(c('True', 'Truncated'), each = length(grd)),
                   X = c(evalDPnorm(mdps[mm], grd),
                         evalDPnorm(mdps_trunc[mm], grd)))
  p <- ggplot(df, aes(x = Value, y = X, group = K)) +
    ylab('Density') +
    geom_line(data = df[df$K == 'True', ],
              color = 'deepskyblue4') +
    geom_line(data = df[df$K == 'Truncated', ],
              color = 'deepskyblue2',
              linetype = 'dashed') +
    theme_bw() +
    theme(legend.position = 'none')
  n <- length(obj$args$data)
  p <- p + geom_point(data = data.frame(Value = rep(obj$args$data, 2) +
                                          runif(n, -0.001, 0.001),
                                        X = runif(n, -max(df$X) / 50, 0),
                                        K = 0),
                      shape = 16, size = 0.5, alpha = 0.5)
  ggsave(paste0('figs/', mm, '/density.png'), width = 6, height = 4)
  df <- data.frame(Value = rep(grd, 2),
                   K = rep(c('True', 'Truncated'), each = length(grd)),
                   X = c(evalDPnorm(mdps[mm], grd, func = 'distribution'),
                         evalDPnorm(mdps_trunc[mm], grd,
                                    func = 'distribution')))
  p <- ggplot(df, aes(x = Value, y = X, group = K)) +
    ylab('Distribtion') +
    geom_line(data = df[df$K == 'True', ],
              color = 'deepskyblue4') +
    geom_line(data = df[df$K == 'Truncated', ],
              color = 'deepskyblue2',
              linetype = 'dashed') +
    theme_bw() +
    theme(legend.position = 'none')
  p <- p + stat_function(fun = ecdf(obj$args$data), aes(group = 0),
                         geom = 'step', n = 1001)
  err_int <- 1 - 0.95
  eps_dkw <- sqrt(log(2 / err_int) / (2 * n))
  upper_dkw <- approxfun(obj$grid, c(pmin(df$X[1:length(obj$grid)] + eps_dkw, 1)))
  lower_dkw <- approxfun(obj$grid, c(pmax(df$X[1:length(obj$grid)] - eps_dkw, 0)))
  eps_clt <- qnorm(1 - (err_int / 2)) *
    sqrt(df$X[1:length(obj$grid)] * (1 - df$X[1:length(obj$grid)]) / n)
  upper_clt <- approxfun(obj$grid, c(df$X[1:length(obj$grid)] + eps_clt))
  lower_clt <- approxfun(obj$grid, c(df$X[1:length(obj$grid)] - eps_clt))
  p <- p +
    stat_function(fun = upper_clt, aes(group = 0), geom = 'step', n = 1001,
                  size = 0.25, alpha = 0.5) +
    stat_function(fun = lower_clt, aes(group = 0), geom = 'step', n = 1001,
                  size = 0.25, alpha = 0.5) +
    stat_function(fun = lower_dkw, aes(group = 0), geom = 'step', n = 1001,
                  size = 0.25, color = 'grey50', linetype = 'longdash',
                  alpha = 0.5) +
    stat_function(fun = upper_dkw, aes(group = 0), geom = 'step', n = 1001,
                  size = 0.25, color = 'grey50', linetype = 'longdash',
                  alpha = 0.5)
  ggsave(paste0('figs/', mm, '/distribution.png'), width = 6, height = 4)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
res_pol <- lapply(res_mdp, seqre)
res_pol_g <- lapply(res_pol, grideval, func = 'distribution')
for (mm in 1:m) {
  p <- plot(res_mdp_g[[mm]], confint = 0.95) +
    geom_line(data = data.frame(Value = res_mdp_g[[mm]]$grid,
                                X = c(evalDPnorm(mdps[mm],
                                                 res_mdp_g[[mm]]$grid,
                                                 func = 'distribution')),
                                K = 0),
              color = 'deepskyblue4') +
    geom_line(data = data.frame(Value = res_mdp_g[[mm]]$grid,
                                X = c(evalDPnorm(mdps_trunc[mm],
                                                 res_mdp_g[[mm]]$grid,
                                                 func = 'distribution')),
                                K = 0),
              color = 'deepskyblue2',
              linetype = 'dashed')
  ggsave(paste0('figs/', mm, '/mdp.png'), width = 6, height = 4)
  p <- plot(res_pol_g[[mm]], confint = 0.95) +
    geom_line(data = data.frame(Value = res_pol_g[[mm]]$grid,
                                X = c(evalDPnorm(mdps[mm],
                                                 res_pol_g[[mm]]$grid,
                                                 func = 'distribution')),
                                K = 0),
              color = 'deepskyblue4') +
    geom_line(data = data.frame(Value = res_pol_g[[mm]]$grid,
                                X = c(evalDPnorm(mdps_trunc[mm],
                                                 res_pol_g[[mm]]$grid,
                                                 func = 'distribution')),
                                K = 0),
              color = 'deepskyblue2',
              linetype = 'dashed')
  ggsave(paste0('figs/', mm, '/pol.png'), width = 6, height = 4)
  mdp_mom <- data.frame(Mean = moments(res_mdp[[mm]], 1),
                        Variance = moments(res_mdp[[mm]], 2))
  mdp_mom_ <- mdp_mom
  while (nrow(mdp_mom_) >= 0.95 * nrow(mdp_mom)) {
    mdp_mom_h <- chull(mdp_mom_)
    mdp_mom_ <- mdp_mom_[-c(mdp_mom_h), ]
  }
  mdp_mom_h <- mdp_mom_[chull(mdp_mom_), ]
  pol_mom <- data.frame(Mean = moments(res_pol[[mm]], 1),
                        Variance = moments(res_pol[[mm]], 2))
  pol_mom_ <- pol_mom
  while (nrow(pol_mom_) >= 0.95 * nrow(pol_mom)) {
    pol_mom_h <- chull(pol_mom_)
    pol_mom_ <- pol_mom_[-c(pol_mom_h), ]
  }
  pol_mom_h <- pol_mom_[chull(pol_mom_), ]
  grd <- res_pol_g[[mm]]$grid
  full <- evalDPnorm(mdps[mm], grd, func = 'density')
  full_m <- pracma::trapz(grd, grd * full)
  full <- data.frame(Mean = full_m,
                     Variance = pracma::trapz(grd, (grd - full_m) ^ 2 * full))
  trnc <- evalDPnorm(mdps_trunc[mm], grd, func = 'density')
  trnc_m <- pracma::trapz(grd, grd * trnc)
  trnc <- data.frame(Mean = trnc_m,
                     Variance = pracma::trapz(grd, (grd - trnc_m) ^ 2 * trnc))
  p <- ggplot(mdp_mom_h, aes(x = Mean, y = Variance)) +
    # geom_point(data = pol_mom, alpha = 0.5, pch = 16, color = 'springgreen2') +
    geom_polygon(data = pol_mom_h, alpha = 0.25, size = 0.5,
                 color = 'springgreen2', fill = 'springgreen2') +
    # geom_point(alpha = 0.5, pch = 16, color = 'springgreen4') +
    geom_polygon(data = mdp_mom_h, alpha = 0.25, size = 0.5,
                 color = 'springgreen4', fill = 'springgreen4') +
    geom_point(data = full, pch = 17, size = 3, col = 'deepskyblue4') +
    geom_point(data = trnc, pch = 18, size = 3, col = 'deepskyblue2') +
    geom_point(data = data.frame(Mean = mean(synth[[mm]]$y),
                                 Variance = var(synth[[mm]]$y)),
               pch = 19, size = 2) +
    theme_bw()
  ggsave(paste0('figs/', mm, '/moms.png'), width = 6, height = 6)
}